# libraries
library(tidyverse)
library(readxl)
library(stargazer)

##### Data Prep ####

# Load our new data
## data on party control and share of women in congress from 1964-2008
party_control <- read_csv('Our Data/party-women_control.csv')
## public opinion data and policy outcome data
df <- read_excel("Our Data/Main dataset for United States, from Gilens 2012.xlsx")

# Code from Mathisen
# logit transformation function
logitTransform <- function(p) { log(p/(1-p)) }

# New main variables
df$outcome <- df$OUTCOME
df$all <- df$PREDALL_SW # predicted value for all voters

# Calc preferences for males and use logit transformation for odds-ratio
df$male <- df$MALE_FAV/(df$MALE_FAV+df$MALE_OPP)
df$male <- ifelse(df$SWITCHER==1,1-df$male,df$male)
df$male_logit <- logitTransform(df$male)

# same with females
df$female <- df$FEMALE_FAV/(df$FEMALE_FAV+df$FEMALE_OPP)
df$female <- ifelse(df$SWITCHER==1,1-df$female,df$female)
df$female_logit <- logitTransform(df$female)

# recode the outcome to binary
df$outcome <- car::recode(df$outcome, "0=0; 1=1;2=1;3=1;4=1; else=NA") 

# gender policy diff variables
df$genderdiff <- abs(df$male-df$female)
df$genderdiff_nonabs <- df$male-df$female

# Mean centering variables 
df$all_mean <- df$all-mean(df$all,na.rm=T)
df$YEAR_mean <- df$YEAR-mean(df$YEAR,na.rm=T)

# Gender diff, DK (don't know) included
df$male_withDK <- df$MALE_FAV/(df$MALE_FAV+df$MALE_OPP+df$MALE_DK)
df$male_withDK <- ifelse(df$SWITCHER==1,1-df$male_withDK,df$male_withDK)
df$female_withDK <- df$FEMALE_FAV/(df$FEMALE_FAV+df$FEMALE_OPP+df$FEMALE_DK)
df$female_withDK <- ifelse(df$SWITCHER==1,1-df$female_withDK,df$female_withDK)
df$genderdiff_nonabs_withDK <- df$male_withDK-df$female_withDK

# Merge in party_control data into Gilens
main <- df %>%
  left_join(party_control, by = c("YEAR" = "year")) %>%
  select(-ends_with(".x")) %>%
  rename_with(~ gsub("\\.y$", "", .), ends_with(".y"))

# partisan control scale
main <- main %>%
  mutate(
    # Assign numeric values based on party control for each branch
    pres_score = ifelse(pres_control == "D", 0.5, 0),
    house_score = ifelse(house_control == "D", 0.25, 0),
    senate_score = ifelse(senate_control == "D", 0.25, 0),
    
    # Calculate total control scale
    party_control_scale = pres_score + house_score + senate_score
  )

# New Outcome Variable
## have some observations that indicate policy was passed half way through a year
## not sure what to do so I just floored those observations
main <- main %>%
  mutate(OUTCMYEAR = floor(OUTCMYEAR))

## new logic of outcome variable from Gilens
expanded_df <- main %>%
  rowwise() %>%
  mutate(
    outcome_first_year = ifelse(OUTCMYEAR == YEAR, 1, 0),
    outcome_second_year = ifelse(OUTCMYEAR == YEAR + 1, 1, 0)
  ) %>%
  ungroup() %>%
  # Ensure obs_type has no missing values by assigning a default category
  mutate(
    obs_type = case_when(
      outcome_first_year == 1 ~ "first_year",
      outcome_second_year == 1 ~ "second_year",
      TRUE ~ "no_adoption"
    )
  ) %>%
  # Duplicate rows based on obs_type with a default weight of 1 if not second_year
  uncount(weights = ifelse(obs_type == "second_year", 2, 1), .id = "obs_id") %>%
  mutate(
    outcome = case_when(
      obs_id == 1 & obs_type == "first_year" ~ 1,
      obs_id == 1 & obs_type == "second_year" ~ 0,
      obs_id == 2 & obs_type == "second_year" ~ 1,
      TRUE ~ 0
    ),
    YEAR = YEAR + ifelse(obs_id == 2 & obs_type == "second_year", 1, 0),
    weight = ifelse(obs_type == "first_year", 1, 0.5)
  ) %>%
  select(-outcome_first_year, -outcome_second_year, -obs_id, -obs_type)

##### Descriptive Analysis #####

# Figure 1: Density of Gender Differences with Gender Gap
figure1 <- ggplot(expanded_df %>% filter(!is.na(outcome), genderdiff > 0.1), 
       aes(x = genderdiff_nonabs, fill = factor(outcome))) +
  geom_density(alpha = 0.5) +
  labs(x = "Gender Difference (Male - Female)", fill = "Policy Outcome") +
  scale_fill_manual(values = c("blue", "green"), labels = c("Not Passed", "Passed")) +
  theme_minimal() +
  theme(axix.text.x = element_text(size = 14), # Adjust size of x-axis text
        axis.text.y = element_text(size = 14), # Adjust size of y-axis text
        axis.title.x = element_text(size = 20), # Adjust size of x-axis label
        axis.title.y = element_text(size = 20), # Adjust size of y-axis label
        legend.title = element_text(size = 14), # Adjust size of legend title
        legend.text = element_text(size = 14)) # Adjust size of legend text
ggsave("figures/figure_1.png", figure1, width = 16, height = 9, dpi = 300)

# Figure 2: Density of Gender Differences with Gender Gap and Party Controls
## Define party control categories with party_control_scale
expanded_df <- expanded_df %>%
  mutate(party_control_category = case_when(
    party_control_scale == 0 ~ "Full Republican",
    party_control_scale == 0.25 ~ "Mostly Republican",
    party_control_scale == 0.5 ~ "Divided",
    party_control_scale == 0.75 ~ "Mostly Democrat",
    party_control_scale == 1 ~ "Full Democrat"
  ))


## Density plot faceted by outcome and detailed party control levels, excluding NA values
figure2 <- ggplot(expanded_df %>% filter(!is.na(outcome), genderdiff > 0.1, !is.na(party_control_category)), 
       aes(x = genderdiff_nonabs, fill = factor(outcome))) +
  geom_density(alpha = 0.5) +
  facet_grid(party_control_category ~ outcome, 
             labeller = labeller(outcome = c(`0` = "Not Passed", `1` = "Passed"))) +
  labs(x = "Gender Difference (Male - Female)",
       y = "Density",
       fill = "Policy Outcome") +
  scale_fill_manual(values = c("blue", "green"), labels = c("Not Passed", "Passed")) +
  theme_minimal() +
  theme(axix.text.x = element_text(size = 14), # Adjust size of x-axis text
        axis.text.y = element_text(size = 14), # Adjust size of y-axis text
        axis.title.x = element_text(size = 20), # Adjust size of x-axis label
        axis.title.y = element_text(size = 20), # Adjust size of y-axis label
        legend.title = element_text(size = 14), # Adjust size of legend title
        legend.text = element_text(size = 14)) # Adjust size of legend text
ggsave("figures/figure_2.png", figure2, width = 16, height = 9, dpi = 300)


##### Regression Analysis #####

# new dataframe to only analyze policies with a gender gap
gendered <- expanded_df %>%
  filter(genderdiff > 0.1)

# Model 1: Treatment Effect
m1 <- glm(outcome ~ genderdiff_nonabs * party_control_scale, data = gendered,
          family = binomial(link = "logit"))

# Model 2: Model with controls

## Create a trend variable for each Congress session (e.g., cumulative female count)
gendered <- gendered %>%
  group_by(congress) %>%
  mutate(trend_female_representation = mean(house_demcount + senate_demcount + house_repcount + senate_repcount, na.rm = TRUE))


## regression model with controls
m2 <- glm(outcome ~ genderdiff_nonabs * party_control_scale + 
            trend_female_representation + all + XL_AREA,
          data = gendered, family = binomial(link = "logit"))

# Table 2: Logistic Regression Coefficients
table2 <- stargazer::stargazer(m1, m2, 
                               type = "latex",  
                               title = "Main Results",
                               dep.var.labels = c("Policy Passage (0,1)"),  
                               covariate.labels = c(
                                 "Intercept",
                                 "Gender Diff (Male-Female)",# Relabel genderdiff_nonabs
                                 "Party Control",# Relabel party_control_scale
                                 "Female Representation",# Relabel femal representation
                                 "Public Support (All)",# Relabel all
                                 "Gender Diff * Party Control"# Relabel interaction
                               ),
                               omit = "XL_AREA",  # Exclude XL_AREA entirely from the output
                               no.space = TRUE,# Remove extra spacing
                               intercept.bottom = FALSE,# Intercept at the top
                               align = TRUE  # Align columns for neat formatting
)

# Figure 3

# Define a range of values for genderdiff_nonabs and party_control_scale
pred_grid <- expand.grid(
  genderdiff_nonabs = seq(-1, 1, length.out = 50),  # Range of gender differences
  party_control_scale = seq(0, 1, by = 0.25)       # Range of party control
)

# Add average values for other predictors
pred_grid$trend_female_representation <- mean(gendered$trend_female_representation, na.rm = TRUE)
pred_grid$all <- mean(gendered$all, na.rm = TRUE)

# Set XL_AREA to the most frequent category 
most_frequent_XL_AREA <- names(sort(table(gendered$XL_AREA), decreasing = TRUE))[1]
pred_grid$XL_AREA <- most_frequent_XL_AREA

# Predict probabilities and calculate standard errors
predictions <- predict(m2, newdata = pred_grid, type = "link", se.fit = TRUE)

# Convert log-odds to probabilities and calculate confidence intervals
pred_grid$predicted_prob <- plogis(predictions$fit)
pred_grid$lower_ci <- plogis(predictions$fit - 1.96 * predictions$se.fit)  # Lower bound of 95% CI
pred_grid$upper_ci <- plogis(predictions$fit + 1.96 * predictions$se.fit)  # Upper bound of 95% CI

## Plot with confidence intervals
figure3 <- ggplot(pred_grid, aes(x = genderdiff_nonabs, y = predicted_prob, color = factor(party_control_scale), group = party_control_scale)) +
  geom_line(size = 1) +  # Predicted probability lines
  geom_ribbon(aes(ymin = lower_ci, ymax = upper_ci, fill = factor(party_control_scale)), alpha = 0.2, color = NA) +  # Confidence intervals
  labs(
    x = "Gender Difference (Male - Female)",
    y = "Predicted Probability of Policy Passage",
    color = "Party Control Scale",
    fill = "Party Control Scale"
  ) +
  scale_color_manual(values = c("blue", "gray", "gray", "gray", "red"), 
                     labels = c("Full Democrat (0)", "Mostly Democrat (0.25)", "Mixed (0.5)", "Mostly Republican (0.75)", "Full Republican (1)")) +
  scale_fill_manual(values = c("blue", "gray", "gray", "gray", "red"), 
                    labels = c("Full Democrat (0)", "Mostly Democrat (0.25)", "Mixed (0.5)", "Mostly Republican (0.75)", "Full Republican (1)")) +
  theme_minimal() +
  theme(axix.text.x = element_text(size = 14), # Adjust size of x-axis text
        axis.text.y = element_text(size = 14), # Adjust size of y-axis text
        axis.title.x = element_text(size = 20), # Adjust size of x-axis label
        axis.title.y = element_text(size = 20), # Adjust size of y-axis label
        legend.title = element_text(size = 14), # Adjust size of legend title
        legend.text = element_text(size = 14)) # Adjust size of legend text
ggsave("figures/figure_3.png", figure3, width = 16, height = 9, dpi = 300)


# Male and Female Models

## needed to add this control to the main dataset
expanded_df1 <- expanded_df%>%
  group_by(congress) %>%
  mutate(trend_female_representation = mean(house_demcount + senate_demcount + house_repcount + senate_repcount, na.rm = TRUE))

## regression models
m3<- glm(outcome ~ male + party_control_scale +
                trend_female_representation + all + XL_AREA,
              data = expanded_df1, family = binomial(link = "logit"))

m4 <- glm(outcome ~ female + party_control_scale +
            trend_female_representation + all + XL_AREA,
          data = expanded_df1, family = binomial(link = "logit"))


## Preparing predicted probabilities
female_grid <- expand.grid(
  female = seq(min(gendered$female, na.rm = TRUE), max(gendered$female, na.rm = TRUE), length.out = 100),
  trend_female_representation = mean(gendered$trend_female_representation, na.rm = TRUE),
  all = mean(gendered$all, na.rm = TRUE),
  party_control_scale = seq(0, 1, by = 0.25)
)

female_grid$XL_AREA <- most_frequent_XL_AREA
female_grid$model <- "Female Preferences"

# Predictions for females
female_predictions <- predict(m4, newdata = female_grid, type = "response", se.fit = TRUE)
female_grid$predicted_prob <- plogis(female_predictions$fit)
female_grid$lower_ci <- plogis(female_predictions$fit - 1.96 * female_predictions$se.fit)
female_grid$upper_ci <- plogis(female_predictions$fit + 1.96 * female_predictions$se.fit)

male_grid <- expand.grid(
  male = seq(min(gendered$male, na.rm = TRUE), max(gendered$male, na.rm = TRUE), length.out = 100),
  trend_female_representation = mean(gendered$trend_female_representation, na.rm = TRUE),
  all = mean(gendered$all, na.rm = TRUE),
  party_control_scale = seq(0, 1, by = 0.25)
)

male_grid$XL_AREA <- most_frequent_XL_AREA
male_grid$model <- "Male Preferences"

# Predictions for males
male_predictions <- predict(m3, newdata = male_grid, type = "response", se.fit = TRUE)
male_grid$predicted_prob <- plogis(male_predictions$fit)
male_grid$lower_ci <- plogis(male_predictions$fit - 1.96 * male_predictions$se.fit)
male_grid$upper_ci <- plogis(male_predictions$fit + 1.96 * male_predictions$se.fit)

## Combining grid and plotting
combined_grid <- rbind(
  male_grid %>% rename(preferences = male),
  female_grid %>% rename(preferences = female)
)

figure4 <- ggplot(combined_grid, aes(x = preferences, y = predicted_prob, color = model, fill = model)) +
  geom_line(size = 1) +
  geom_ribbon(aes(ymin = lower_ci, ymax = upper_ci), alpha = 0.2, color = NA) +
  labs(
    x = "% Support",
    y = "Predicted Probability",
    color = "Model",
    fill = "Model"
  ) +
  theme_minimal() +
  scale_color_manual(values = c("pink", "lightblue")) +
  scale_fill_manual(values = c("pink", "lightblue")) +
  theme(axix.text.x = element_text(size = 14), # Adjust size of x-axis text
        axis.text.y = element_text(size = 14), # Adjust size of y-axis text
        axis.title.x = element_text(size = 20), # Adjust size of x-axis label
        axis.title.y = element_text(size = 20), # Adjust size of y-axis label
        legend.title = element_text(size = 14), # Adjust size of legend title
        legend.text = element_text(size = 14)) # Adjust size of legend text
ggsave("figures/figure_4.png", figure4, width = 16, height = 9, dpi = 300)

#### Issue Area Analysis

#Create Income Difference Variables
expanded_df$inc9050_diff <- abs(expanded_df$PRED90_SW-expanded_df$PRED50_SW)
expanded_df$inc9010_diff <- abs(expanded_df$PRED90_SW-expanded_df$PRED10_SW)
expanded_df$inc5010_diff <- abs(expanded_df$PRED50_SW-expanded_df$PRED10_SW)

#Create Race Difference Variables
expanded_df$white <- expanded_df$WHITE_FAV / (expanded_df$WHITE_FAV+expanded_df$WHITE_OPP)
expanded_df$white <- ifelse(expanded_df$SWITCHER==1,1-expanded_df$white,expanded_df$white)
expanded_df$black <- expanded_df$BLACK_FAV / (expanded_df$BLACK_FAV+expanded_df$BLACK_OPP)
expanded_df$black <- ifelse(expanded_df$SWITCHER==1,1-expanded_df$black,expanded_df$black)

expanded_df$race_diff <- abs(expanded_df$white - expanded_df$black)

#Splitting up by issue areas:

#Foreign policy
foreign_policy <- expanded_df %>%
  filter(XL_AREA == "foreign pol" | XL_AREA == "defense" | XL_AREA == "terrorism")
m5<- lm(outcome ~ genderdiff_nonabs * party_control_scale + all, data = foreign_policy)

#Social welfare
social_welfare <- expanded_df %>%
  filter(XL_AREA == "soc welfare" | XL_AREA == "health" | XL_AREA == "education")
m6<- lm(outcome ~ genderdiff_nonabs * party_control_scale + all, data = social_welfare)

#Economic Policy
economic_policy <- expanded_df %>%
  filter(XL_AREA == "econ & labor" | XL_AREA == "taxation" | XL_AREA == "budget")
m7<- lm(outcome ~ genderdiff_nonabs * party_control_scale + all, data = economic_policy)

#Religious Issues
religious_issues <- expanded_df %>%
  filter(XL_AREA == "religious")
m8<- lm(outcome ~ genderdiff_nonabs * party_control_scale + all, data = religious_issues)

#Table 3: Descriptive Issue Preference Differences

#Foreign Policy Average Differences
f1 <- foreign_policy %>% dplyr::summarise(sex=round(mean(genderdiff,na.rm=T),3)*100,
                                                   inc50=round(mean(inc9050_diff,na.rm=T),3)*100,
                                                   inc10=round(mean(inc9010_diff,na.rm=T),3)*100,
                                                   race=round(mean(race_diff, na.rm=T),3)*100)

#Social Welfare Average Differences
s1 <- social_welfare %>% dplyr::summarise(sex=round(mean(genderdiff,na.rm=T),3)*100,
                                          inc50=round(mean(inc9050_diff,na.rm=T),3)*100,
                                          inc10=round(mean(inc9010_diff,na.rm=T),3)*100,
                                          race=round(mean(race_diff, na.rm=T),3)*100)

#Economic Policy Average Differences
e1 <- economic_policy %>% dplyr::summarise(sex=round(mean(genderdiff,na.rm=T),3)*100,
                                         inc50=round(mean(inc9050_diff,na.rm=T),3)*100,
                                         inc10=round(mean(inc9010_diff,na.rm=T),3)*100,
                                         race=round(mean(race_diff, na.rm=T),3)*100)

#Religious Issues Average Differences
r1 <- religious_issues %>% dplyr::summarise(sex=round(mean(genderdiff,na.rm=T),3)*100,
                                            inc50=round(mean(inc9050_diff,na.rm=T),3)*100,
                                            inc10=round(mean(inc9010_diff,na.rm=T),3)*100,
                                            race=round(mean(race_diff, na.rm=T),3)*100)

means <- rbind(f1, s1, e1, r1)
means <- data.frame(t(means))
rownames(means) <- c("Men vs. women", "Middle income (P50) vs. high income (P90)", "Low income (P10) vs. high income (P90)", "Race (Black vs. White)")
colnames(means) <- c("Foreign Policy", "Social Welfare", "Economic Policy", "Religious Issues")

#Generate Table 3: Summary of average preference differences
stargazer(means, type = "text", digits = 1, digits.extra = 2, summary = FALSE)

# Table 4: LPM Regression Results
stargazer(m6, m6, m7, m8, 
          title="Policy Responsiveness by Issue Area", 
          align=TRUE, 
          column.labels = c("Foreign Policy", "Social Welfare", "Economic Policy", "Religious Issues"), 
          covariate.labels = c("Gender Difference", "Party Control", "Overall Support", "Gender Difference * Party Control"))





